# Rename timepoint-specific UMI columns to a common naming scheme.
#
# Every rule below is applied with gsub() to all column names of `df`, in the
# order listed. Order matters: the longer, more specific labels (for example
# "total_umi_L2_W14") have to be rewritten before the shorter labels that are
# a prefix of them ("total_umi_L2").
#
# Only base R is used, so the function does not rely on the caller having
# attached a package that provides `%>%`.
#
# Args:
#   df: An object with column names (typically a data frame).
# Returns:
#   `df` with its column names rewritten; the contents are left untouched.
rename_columns <- function(df) {
  # One rule per row: regular expression, then its replacement.
  rename_rules <- matrix(
    c(
      "total_umi_PreL_W0", "Prevaccine_umi",
      "total_umi_fraction_PreL_W0", "Prevaccine_umi_fraction",
      "total_umi_L2_W14", "Postvaccine_umi",
      "total_umi_fraction_L2_W14", "Postvaccine_umi_fraction",
      "total_umi_L3_W22|total_umi_W24", "Postnivo_umi",
      "total_umi_fraction_L3_W22|total_umi_fraction_W24",
      "Postnivo_umi_fraction",
      "total_umi_w00", "Prevaccine_umi",
      "total_umi_fraction_w00", "Prevaccine_umi_fraction",
      "total_umi_w14", "Postvaccine_umi",
      "total_umi_fraction_w14", "Postvaccine_umi_fraction",
      "total_umi_w22", "Postnivo_umi",
      "total_umi_fraction_w22", "Postnivo_umi_fraction",
      "total_umi_W16", "w16_umi",
      "total_umi_fraction_W16", "w16_umi_fraction",
      "total_umi_LATE_W40", "w40_umi",
      "total_umi_fraction_LATE_W40", "w40_umi_fraction",
      "total_umi_Pre-LL", "Prevaccine_umi",
      "total_umi_fraction_Pre-LL", "Prevaccine_umi_fraction",
      "total_umi_L2", "Postvaccine_umi",
      "total_umi_fraction_L2", "Postvaccine_umi_fraction",
      "total_umi_L3", "Postnivo_umi",
      "total_umi_fraction_L3", "Postnivo_umi_fraction"
    ),
    ncol = 2,
    byrow = TRUE
  )

  new_names <- colnames(df)
  for (i in seq_len(nrow(rename_rules))) {
    new_names <- gsub(rename_rules[i, 1], rename_rules[i, 2], new_names)
  }
  colnames(df) <- new_names
  df
}
# Limit of detection (LOD) for a vector of values.
#
# The LOD is the largest power of ten that does not exceed the smallest
# strictly positive value in `vector`, i.e. 10^floor(log10(min positive)).
#
# Args:
#   vector: Numeric vector; zeros, negatives, NA/NaN and infinite entries
#     are ignored.
# Returns:
#   A single number, or NA when `vector` holds no finite positive value
#   (including when it is NULL or empty).
calculate_LOD <- function(vector) {
  # Filter first so min() is never called on an empty vector, which would
  # emit a warning and return Inf.
  positive <- vector[is.finite(vector) & vector > 0]
  if (length(positive) == 0L) {
    return(NA)
  }
  10^floor(log10(min(positive)))
}
# Build one merged clonotype table for a patient across several timepoints.
#
# For each entry of `timepoints` the tab-separated file
#   <base_path>/<filename><patient_id>_<timepoint>.TRB.tsv
# is read, every row is keyed by "aaSeqCDR3;vGene;jGene" (`clonotype_id`),
# and the columns whose name contains "total" are kept with the timepoint
# appended as a suffix. The per-timepoint tables are combined with a full
# outer merge on `clonotype_id`, all NA cells are set to 0, the columns are
# renamed by rename_columns(), and limit-of-detection columns are added with
# calculate_LOD().
#
# Args:
#   patient_id: Patient identifier used in the file name.
#   base_path: Directory that holds the input files.
#   timepoints: Character vector of timepoint labels (at least one).
#   filename: File name prefix placed before `patient_id`.
# Returns:
#   A data frame with one row per clonotype. `Prevaccine_LOD` and
#   `Postvaccine_LOD` are always added; `Postnivo_LOD`, `w40_LOD` and
#   `w16_LOD` are added only when the matching `*_umi_fraction` column exists.
merge_patient_data <- function(patient_id, base_path, timepoints, filename) {
  if (length(timepoints) == 0L) {
    stop("`timepoints` must contain at least one timepoint", call. = FALSE)
  }

  # Read one file, keep the columns of interest and add the clonotype key.
  select_columns <- function(file_path) {
    # The inputs are .tsv files; read.csv() needs a single-character `sep`.
    df <- read.csv(file_path, sep = "\t")
    wanted <- grep(
      "total|aaSeqCDR3|vGene|jGene|aaImputedVDJSequence",
      colnames(df)
    )
    # drop = FALSE keeps a data frame even if only one column matches.
    df <- df[, wanted, drop = FALSE]
    df$clonotype_id <- paste(df$aaSeqCDR3, df$vGene, df$jGene, sep = ";")
    df
  }

  # Suffix every column except the key, then keep the key and "total" columns.
  select_and_rename <- function(df, suffix) {
    is_key <- colnames(df) == "clonotype_id"
    colnames(df)[!is_key] <- paste0(colnames(df)[!is_key], "_", suffix)
    df[, grep("total|clonotype_id", colnames(df)), drop = FALSE]
  }

  # One trimmed, suffixed table per timepoint.
  dataframes <- lapply(timepoints, function(tp) {
    file_path <- file.path(
      base_path,
      paste0(filename, patient_id, "_", tp, ".TRB.tsv")
    )
    select_and_rename(select_columns(file_path), tp)
  })

  # Full outer merge so a clonotype seen at any timepoint is retained.
  merged_df <- Reduce(
    function(x, y) merge(x, y, by = "clonotype_id", all = TRUE),
    dataframes
  )

  # Clonotypes absent at a timepoint come out of the merge as NA; use 0.
  merged_df[is.na(merged_df)] <- 0
  merged_df <- rename_columns(merged_df)

  merged_df$Prevaccine_LOD <- calculate_LOD(merged_df$Prevaccine_umi_fraction)
  merged_df$Postvaccine_LOD <-
    calculate_LOD(merged_df$Postvaccine_umi_fraction)

  # Optional timepoints: add the LOD only when the fraction column is present.
  # The w16 case is driven by the column as well, not by a fixed patient id.
  for (label in c("Postnivo", "w40", "w16")) {
    fraction_col <- paste0(label, "_umi_fraction")
    if (fraction_col %in% colnames(merged_df)) {
      merged_df[[paste0(label, "_LOD")]] <-
        calculate_LOD(merged_df[[fraction_col]])
    }
  }

  merged_df
}
# Build the merged clonotype table for a patient from the rerun files.
#
# For each entry of `timepoints` the tab-separated file
#   <base_path>/<filename><patient_id><timepoint>.TRB.tsv
# is read (no separator is inserted between patient id and timepoint), every
# row is keyed by "aaSeqCDR3;vGene;jGene" (`clonotype_id`), and the columns
# whose name contains "total" are kept with the timepoint appended as a
# suffix. The tables are combined with a full outer merge on `clonotype_id`,
# all NA cells are set to 0, the columns are renamed by rename_columns(), and
# limit-of-detection columns are added with calculate_LOD().
#
# Special case: for a path containing both "PT113" and "w00", `total_umi` and
# `total_umi_fraction` are recomputed from the 2nd to 4th `uniqueUMICount`
# columns, leaving out the first one.
#
# Args:
#   patient_id: Patient identifier used in the file name.
#   base_path: Directory that holds the input files.
#   timepoints: Character vector of timepoint labels (at least one).
#   filename: File name prefix placed before `patient_id`.
# Returns:
#   A data frame with one row per clonotype plus `Prevaccine_LOD`,
#   `Postvaccine_LOD` and `Postnivo_LOD` (NA when the matching fraction
#   column is absent or has no positive value).
merge_patient_data_rerun <- function(patient_id, base_path, timepoints,
                                     filename) {
  if (length(timepoints) == 0L) {
    stop("`timepoints` must contain at least one timepoint", call. = FALSE)
  }

  keep_pattern <- "total|aaSeqCDR3|vGene|jGene|aaImputedVDJSequence"

  # Read one file, keep the columns of interest and add the clonotype key.
  select_columns <- function(file_path) {
    # The inputs are .tsv files; read.csv() needs a single-character `sep`.
    df <- read.csv(file_path, sep = "\t")

    # `&&` because both operands are single logical values.
    if (grepl("PT113", file_path) && grepl("w00", file_path)) {
      # Drop the first replicate of PT113: it was a replicate outlier for
      # this sample only.
      print(file_path)
      df <- df[
        ,
        grep(paste0(keep_pattern, "|uniqueUMICount"), colnames(df)),
        drop = FALSE
      ]
      # Per-replicate UMI count columns.
      # NOTE(review): assumes they appear in replicate order 1..4 -- confirm.
      umi_columns <- grep("uniqueUMICount", colnames(df))
      if (length(umi_columns) < 4L) {
        stop(
          "Expected at least 4 uniqueUMICount columns in ", file_path,
          call. = FALSE
        )
      }
      # Replace the totals with the sum over replicates 2, 3 and 4.
      df$total_umi <- rowSums(
        df[, umi_columns[2:4], drop = FALSE],
        na.rm = TRUE
      )
      df$total_umi_fraction <- df$total_umi / sum(df$total_umi)
    }

    # drop = FALSE keeps a data frame even if only one column matches.
    df <- df[, grep(keep_pattern, colnames(df)), drop = FALSE]
    df$clonotype_id <- paste(df$aaSeqCDR3, df$vGene, df$jGene, sep = ";")
    df
  }

  # Suffix every column except the key, then keep the key and "total" columns.
  select_and_rename <- function(df, suffix) {
    is_key <- colnames(df) == "clonotype_id"
    colnames(df)[!is_key] <- paste0(colnames(df)[!is_key], "_", suffix)
    df[, grep("total|clonotype_id", colnames(df)), drop = FALSE]
  }

  # One trimmed, suffixed table per timepoint.
  dataframes <- lapply(timepoints, function(tp) {
    file_path <- file.path(
      base_path,
      paste0(filename, patient_id, tp, ".TRB.tsv")
    )
    select_and_rename(select_columns(file_path), tp)
  })

  # Full outer merge so a clonotype seen at any timepoint is retained.
  merged_df <- Reduce(
    function(x, y) merge(x, y, by = "clonotype_id", all = TRUE),
    dataframes
  )

  # Clonotypes absent at a timepoint come out of the merge as NA; use 0.
  merged_df[is.na(merged_df)] <- 0
  merged_df <- rename_columns(merged_df)

  merged_df$Prevaccine_LOD <- calculate_LOD(merged_df$Prevaccine_umi_fraction)
  merged_df$Postvaccine_LOD <-
    calculate_LOD(merged_df$Postvaccine_umi_fraction)
  # Uses the Postnivo fraction; previously the Postvaccine column was passed.
  merged_df$Postnivo_LOD <- calculate_LOD(merged_df$Postnivo_umi_fraction)

  merged_df
}